import time
import re
import random
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
import warnings
warnings.filterwarnings('ignore')

timeList = []   # review publication dates ("YYYY-M-D" strings)
scoreList = []  # review scores (the numeric part before "分")
comments = []   # review body text

def getData(driver, ddl1, j):
    '''Scrape the date, score and text of every review on the current page.

    Appends one entry per successfully-parsed review to the module-level
    lists ``timeList``, ``scoreList`` and ``comments`` (kept in lockstep).

    Args:
        driver: Selenium WebDriver with the review page already loaded
            (any object exposing ``find_elements`` works).
        ddl1: total page count (string/int) — used only in the progress message.
        j: current page number — used only in the progress message.
    '''
    times = driver.find_elements(By.CSS_SELECTOR, '.commentTime')
    # The first '.averageScore' element is the attraction's overall score,
    # not a per-review score — skip it so scores align with reviews.
    scores = driver.find_elements(By.CSS_SELECTOR, '.averageScore')[1:]
    comment = driver.find_elements(By.CSS_SELECTOR, '.commentDetail')

    for c, t, s in zip(comment, times, scores):
        # Extract first, append after: if either regex finds no match we skip
        # the whole review, so the three lists never drift out of lockstep.
        # (Previously a partial append could occur before the bare `except`.)
        try:
            when = re.findall(r'(\d{4}-\d{1,2}-\d{1,2})', t.text)[0]
            score = re.findall(r"(.*)分", s.text)[0]
        except IndexError:
            # Malformed or unexpected element text — best-effort: skip it.
            continue
        timeList.append(when)
        scoreList.append(score)
        comments.append(c.text)

    print(f"共{int(ddl1)}页，第{j}页下载完成...")

if __name__ == '__main__':
    # Attraction names (used in the output file names) and their Ctrip pages,
    # kept in matching order.
    names=['外滩','上海迪士尼度假区','东方明珠','上海海昌海洋公园','豫园',\
         '上海野生动物园','上海博物馆','上海天文馆','黄浦江','上海中心大厦',\
         '上海海洋水族馆','上海之巅观光厅','城隍庙旅游区','上海动物园','迪士尼小镇','上海欢乐谷','上海佘山国家旅游度假区','陆家嘴']
    urls=['https://you.ctrip.com/sight/shanghai2/736.html','https://you.ctrip.com/sight/shanghai2/1412255.html','https://you.ctrip.com/sight/shanghai2/762.html',\
          'https://you.ctrip.com/sight/shanghai2/4651499.html','https://you.ctrip.com/sight/shanghai2/740.html','https://you.ctrip.com/sight/shanghai2/758.html',\
          'https://you.ctrip.com/sight/shanghai2/3733.html','https://you.ctrip.com/sight/shanghai2/69572753.html','https://you.ctrip.com/sight/shanghai2/1815443.html',\
          'https://you.ctrip.com/sight/shanghai2/1827363.html', 'https://you.ctrip.com/sight/shanghai2/18722.html', 'https://you.ctrip.com/sight/shanghai2/2005734.html',\
          'https://you.ctrip.com/sight/shanghai2/5691.html', 'https://you.ctrip.com/sight/shanghai2/25506.html', 'https://you.ctrip.com/sight/shanghai2/1936675.html',\
          'https://you.ctrip.com/sight/shanghai2/65957.html', 'https://you.ctrip.com/sight/shanghai2/5140522.html', 'https://you.ctrip.com/sight/shanghai2/1815444.html',\
          ]

    MAX_PAGES = 500  # safety cap on pages scraped per attraction

    for name, url in zip(names, urls):
        # Reset the module-level buffers so each attraction's CSV contains
        # only its own reviews (previously they accumulated across sights,
        # so every CSV also contained all earlier attractions' data).
        timeList.clear()
        scoreList.clear()
        comments.clear()

        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')
        options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.53 Safari/537.36')
        # Raw string: "\A", "\S" in a plain literal are invalid escape sequences.
        # Replace with the actual ChromeDriver path on your machine.
        service = Service(executable_path=r"D:\Anaconda3\Scripts\chromedriver-win64\chromedriver.exe")
        driver = webdriver.Chrome(service=service, options=options)
        driver.maximize_window()

        try:
            driver.get(url)
            time.sleep(4)

            # Read the total page count from the pagination widget: the
            # next-to-last line of its text is the highest page number.
            total_pages = '1'  # fallback when no pagination widget is found
            for t in driver.find_elements(By.CSS_SELECTOR, '.ant-pagination'):
                total_pages = t.text.split("\n")[-2]

            page = 1  # renamed from `j`, which shadowed the outer loop index
            while True:
                pause = random.uniform(2, 3)  # random per-page delay

                getData(driver, total_pages, page)  # scrape current page
                page += 1

                # Click "next page" via JS: a plain .click() can be
                # intercepted by overlays on this site.
                nxt = driver.find_element(By.CSS_SELECTOR, value=r'.ant-pagination-next')
                driver.execute_script("arguments[0].click();", nxt)

                if page == int(total_pages) + 1 or page > MAX_PAGES:
                    break

                time.sleep(pause)

        finally:
            # quit() shuts the whole driver session down; close() only closed
            # the window and leaked one chromedriver process per attraction.
            driver.quit()

        # Save this attraction's reviews to its own CSV.
        data = pd.DataFrame({ "date": timeList,"评分": scoreList, "comments": comments })
        data.to_csv(f"./data/result_{name}.csv", encoding='utf8')
        print("**********done***********")
    print("all done")  # was `print(all_done)` — undefined name, NameError